Background

The purpose of this workflow is to associate gene module expression (EOS, PMN) with host outcomes including:

  • fMRI
  • Lung function / asthma
  • Protein

Setup

Load packages

# Data manipulation and figures
library(tidyverse)
library(readxl)
library(cowplot)
library(tidytext)
# RNAseq data
library(limma)
#Correlation
library(corrplot)
#PLS
library(mixOmics)
# Print tty table to knit file
library(knitr)
library(kableExtra)
options(knitr.kable.NA = '')

`%notin%` <- Negate(`%in%`)

Set seed

set.seed(4389)

Load functions

source("scripts/corr.fxn.R")

Load data

load("data_clean/P337_BAL_data.RData")

Format data

Calculate V5 - V4

When applicable, correlations will be completed for delta values (V5-V4) to capture the paired nature of these data.

Module expression

Two donors are removed from BE expression data (MA1001 missing V4, MA1086 missing V5).

#Read in all module expression
mod.files <- list.files(path="results/module_level/", pattern="mod_voom_counts",
                        recursive = TRUE, full.names = TRUE)

counts.mod <- data.frame()
for(file in mod.files){
  counts.temp <- read_csv(file) %>% 
    pivot_longer(-c(module,module.char), names_to="libID")
  
  counts.mod <- rbind(counts.mod, counts.temp)
}
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   .default = col_double(),
##   module = col_character(),
##   module.char = col_character()
## )
## ℹ Use `spec()` for the full column specifications.
## 
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   .default = col_double(),
##   module = col_character(),
##   module.char = col_character()
## )
## ℹ Use `spec()` for the full column specifications.
#Calculate delta
counts.mod.delta <- counts.mod %>% 
  #Remove modules 00
  filter(!grepl("_00",module)) %>% 
  dplyr::select(-module.char) %>% 
  #Add metadata to denote V4, V5
  full_join(dplyr::select(dat.BAL.abund.norm.voom$targets, 
                   libID, donorID, visit), by = "libID") %>% 
  #separate V4 and V5
  dplyr::select(-libID) %>% 
  pivot_wider(names_from = visit) %>% 
  mutate(delta = V5-V4) %>% 
  #shorten module names
  mutate(module=gsub("module_P337_", "", module)) %>% 
  #wide format
  dplyr::select(-V4,-V5) %>% 
  arrange(donorID, module) %>% 
  pivot_wider(names_from = module, values_from = delta)

Cytokine protein

plex.delta <- read_csv("data_raw/addtl.data/P337_BAL.multiplex.csv") %>% 
  rename(donorID=ptID, FGF2_V4=TGF2_V4) %>% 
  pivot_longer(-donorID) %>% 
  #Log10 transform
  mutate(value = ifelse(value==0,0, log10(value))) %>% 
  separate(name, into=c("name","visit"), sep="_") %>% 
  pivot_wider(names_from = visit) %>% 
  mutate(delta = V5-V4) %>% 
  drop_na(delta) %>% 
  dplyr::select(-V4,-V5) %>% 
  arrange(name) %>% 
  mutate(name = paste(name, "multiplex", sep=".")) %>% 
  pivot_wider(values_from = delta) 
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   .default = col_double(),
##   ptID = col_character()
## )
## ℹ Use `spec()` for the full column specifications.

Check if cytokine genes are in expression modules as well.

Module Gene HGNC Cytokine
EOS_01 CCL13 CCL13
CCL2 CCL2
CCL22 CCL22
CCL26 CCL26
CCL7 CCL7
CXCL8 IL8
CCL17 TARC
TGFA TGFA
EOS_02 FLT3LG FLT3L
IL10 IL10
IL9 IL9
VEGFA VEGF
EOS_04 IL16 IL16
EOS_05 IL1RN IL1RA
EOS_06 CCL8 CCL8
CXCL1 CXCL1
IL13 IL13
IL6 IL6
CD40LG sCD40L
PMN_01 CCL13 CCL13
CCL2 CCL2
CCL7 CCL7
CCL8 CCL8
CXCL1 CXCL1
IL10 IL10
IL6 IL6
CXCL8 IL8
VEGFA VEGF

fMRI

Donor MA1012 is removed due to movement issues in fMRI.

#Time point 1 = visit 4
neuro <- read_excel(sheet="T1",
  "data_raw/addtl.data/extraced.clusters.Matt.Altman_wbaseline_psychdata.xlsx") %>% 
  #long format
  pivot_longer(-idnum, names_to="neuro") %>% 
  #add visit variable
  mutate(visit="V4")

#Time point 2 = visit 5
neuro <- read_excel(sheet="T2",
  "data_raw/addtl.data/extraced.clusters.Matt.Altman_wbaseline_psychdata.xlsx") %>% 
  #long format
  pivot_longer(-idnum, names_to="neuro") %>% 
  #add visit variable
  mutate(visit="V5") %>% 
  #Combine with other visit
  full_join(neuro) %>% 
  #Format idnum to match RNAseq data
  mutate(idnum = paste("MA",idnum, sep="")) %>% 
  rename(donorID=idnum)

neuro.delta <- neuro %>% 
  filter(donorID != "MA1012" & neuro != "LSI" & neuro != "BDI") %>% 
  #Calculate delta
  pivot_wider(names_from = visit) %>% 
  mutate(delta = V5-V4) %>% 
  #wide format
  dplyr::select(-V4,-V5) %>% 
  pivot_wider(names_from = neuro, values_from = delta)

Lung function and asthma

asthma.delta <- dat.BAL.abund.norm.voom$targets %>% 
  dplyr::select(donorID, FEV1.pctPP.preAlbuterol_V4:FeNO.PreBro_V5) %>% 
  distinct() %>% 
  #get visit from variable names
  pivot_longer(-donorID) %>% 
  separate(name, into=c("asthma","visit"), sep="_") %>% 
  #Calculate delta
  pivot_wider(names_from = visit) %>% 
  mutate(delta = V5-V4) %>% 
  #Wide format
  dplyr::select(-V4,-V5) %>% 
  pivot_wider(names_from = asthma, values_from = delta)

Partial least squares (PLS)

Regression mode: Y matrix is deflated with respect to information extracted/modeled from local regression on X. Here, the goal is to predict Y from X (and vice versa).

Sparse: simultaneous variable selection in X and Y with LASSO penalization on each pair of loading vectors

http://mixomics.org/methods/spls/

fMRI ~ module expression + cytokine protein

Format data

X <- counts.mod.delta %>% 
  filter(donorID %in% neuro.delta$donorID) %>% 
  full_join(plex.delta) %>% 
  column_to_rownames("donorID")
## Joining, by = "donorID"
X <- X[complete.cases(X),]
 
Y <- neuro.delta %>% 
  filter(donorID %in% rownames(X)) %>% 
  column_to_rownames("donorID")

Perform PLS regression.

ncomp = 10
#PLS
pls <- pls(X, Y, ncomp = ncomp, mode = "regression")  

#SPLS
spls <- spls(X, Y, ncomp = ncomp, mode = "regression")  

Tuning

Number of components

Compute evaluation criteria for (S)PLS.

pls.Q2 <- perf(pls, validation = "Mfold", folds = 10, nrepeat = 100,
               progressBar = FALSE)

spls.Q2 <- perf(spls, validation = "Mfold", folds = 10, nrepeat = 100,
                progressBar = FALSE)

Q2 total is the sum of the quality of fit over all variables. In general, components should only continue to be added if Q2 < 0.0975 (red line). After that, there are diminishing returns to adding more components. Here, we would need to retain too many components by this general rule.

Thus, axis loadings will be assessed as well. Looking at the axis loadings (same for PLS and SPLS), component 1 explains the most variation (~40%) in X with further components contributing to a lesser extent (<10%). Y shows more even contributions from components 1-3 being > 25%.

Taken together, 2 X and 3 Y components will be used.

Number of variables

Compute mean absolute values (MAE) for variables on each component. (Only SPLS is assessed from this point).

#X variables
spls.MAE.X <- tune.spls(X, Y, 
                        ncomp = ncompX, #Determined above
                        test.keepX = c(2:30, 40),
                        validation = "Mfold", 
                        folds = 10, nrepeat = 100, 
                        progressBar = FALSE, 
                        measure = 'MAE')
#Y variables
spls.MAE.Y <- tune.spls(Y, X,
                        ncomp = ncompY, #Determined above
                        test.keepX = c(2:10),
                        validation = "Mfold", 
                        folds = 10, nrepeat = 10, 
                        progressBar = FALSE, 
                        measure = 'MAE')

#Save
dir.create("results/PLS/", showWarnings = FALSE)
save(spls.MAE.X, spls.MAE.Y,
     file="results/PLS/MAE.RData")

Based on minimum MAE, the number of variables to keep per component is

Space Component No. to keep
X 1 24
2 8
Y 1 9
2 10
3 8

Run (S)PLS

Using the parameters determined above, run the final (S)PLS regression. 2 components will be used based on tuning X.

result.spls <- spls(X,Y, 
                    keepX = spls.MAE.X$choice.keepX,
                    keepY = spls.MAE.Y$choice.keepX,
                    ncomp = ncompX, mode = "regression")  

This results in the following variables across 2 components.

Correlations within SPLS

fMRI ~ module expression

Format data

X <- counts.mod.delta %>% 
  filter(donorID %in% neuro.delta$donorID) %>% 
  column_to_rownames("donorID")
X <- X[complete.cases(X),]
 
Y <- neuro.delta %>% 
  filter(donorID %in% rownames(X)) %>% 
  column_to_rownames("donorID")

Perform PLS regression.

ncomp = 9
#PLS
pls <- pls(X, Y, ncomp = ncomp, mode = "regression")  

#SPLS
spls <- spls(X, Y, ncomp = ncomp, mode = "regression")  

Tuning

Number of components

Compute evaluation criteria for (S)PLS.

pls.Q2 <- perf(pls, validation = "Mfold", folds = 10, nrepeat = 100,
               progressBar = FALSE)

spls.Q2 <- perf(spls, validation = "Mfold", folds = 10, nrepeat = 100,
                progressBar = FALSE)

Q2 total is the sum of the quality of fit over all variables. In general, components should only continue to be added if Q2 < 0.0975 (red line). After that, there are diminishing returns to adding more components. Here, we would need to retain too many components by this general rule.

Thus, axis loadings will be assessed as well. Looking at the axis loadings (same for PLS and SPLS), component 1 explains the most variation (~40%) in X with further components contributing to a lesser extent (<10%). Y shows more even contributions from components 1-3 being > 25%.

Taken together, 2 X and 2 Y components will be used.

Number of variables

Compute mean absolute values (MAE) for variables on each component. (Only SPLS is assessed from this point).

#X variables
spls.MAE.X <- tune.spls(X, Y, 
                        ncomp = ncompX, #Determined above
                        test.keepX = c(2:9),
                        validation = "Mfold", 
                        folds = 10, nrepeat = 100, 
                        progressBar = FALSE, 
                        measure = 'MAE')
#Y variables
spls.MAE.Y <- tune.spls(Y, X,
                        ncomp = ncompY, #Determined above
                        test.keepX = c(2:10),
                        validation = "Mfold", 
                        folds = 10, nrepeat = 10, 
                        progressBar = FALSE, 
                        measure = 'MAE')

#Save
save(spls.MAE.X, spls.MAE.Y,
     file="results/PLS/MAE3.RData")

Based on minimum MAE, the number of variables to keep per component is

Space Component No. to keep
X 1 2
2 8
Y 1 10
2 2

Run (S)PLS

Using the parameters determined above, run the final (S)PLS regression. 2 components will be used based on tuning X.

result.spls <- spls(X,Y, 
                    keepX = spls.MAE.X$choice.keepX,
                    keepY = spls.MAE.Y$choice.keepX,
                    ncomp = ncompX, mode = "regression")  

This results in the following variables across 2 components.

Correlations within SPLS

fMRI ~ cytokine multiplex

Format data

X <- plex.delta %>% 
  filter(donorID %in% neuro.delta$donorID) %>% 
  column_to_rownames("donorID")
X <- X[complete.cases(X),]
 
Y <- neuro.delta %>% 
  filter(donorID %in% rownames(X)) %>% 
  column_to_rownames("donorID")

Perform PLS regression.

ncomp = 10
#PLS
pls <- pls(X, Y, ncomp = ncomp, mode = "regression")  

#SPLS
spls <- spls(X, Y, ncomp = ncomp, mode = "regression")  

Tuning

Number of components

Compute evaluation criteria for (S)PLS.

pls.Q2 <- perf(pls, validation = "Mfold", folds = 10, nrepeat = 100,
               progressBar = FALSE)

spls.Q2 <- perf(spls, validation = "Mfold", folds = 10, nrepeat = 100,
                progressBar = FALSE)

Q2 total is the sum of the quality of fit over all variables. In general, components should only continue to be added if Q2 < 0.0975 (red line). After that, there are diminishing returns to adding more components. Here, we would need to retain too many components by this general rule.

Thus, axis loadings will be assessed as well. Looking at the axis loadings (same for PLS and SPLS), component 1 explains the most variation (~40%) in X with further components contributing to a lesser extent (<10%). Y shows more even contributions from components 1-3 being > 25%.

Taken together, 2 X and 2 Y components will be used.

Number of variables

Compute mean absolute values (MAE) for variables on each component. (Only SPLS is assessed from this point).

#X variables
spls.MAE.X <- tune.spls(X, Y, 
                        ncomp = ncompX, #Determined above
                        test.keepX = c(2:30,40),
                        validation = "Mfold", 
                        folds = 10, nrepeat = 100, 
                        progressBar = FALSE, 
                        measure = 'MAE')
#Y variables
spls.MAE.Y <- tune.spls(Y, X,
                        ncomp = ncompY, #Determined above
                        test.keepX = c(2:10),
                        validation = "Mfold", 
                        folds = 10, nrepeat = 10, 
                        progressBar = FALSE, 
                        measure = 'MAE')

#Save
save(spls.MAE.X, spls.MAE.Y,
     file="results/PLS/MAE4.RData")

Based on minimum MAE, the number of variables to keep per component is

Space Component No. to keep
X 1 24
2 2
Y 1 9
2 10

Run (S)PLS

Using the parameters determined above, run the final (S)PLS regression. 2 components will be used based on tuning X.

result.spls <- spls(X,Y, 
                    keepX = spls.MAE.X$choice.keepX,
                    keepY = spls.MAE.Y$choice.keepX,
                    ncomp = ncompX, mode = "regression")  

This results in the following variables across 2 components.

Correlations within SPLS

lung function ~ module expression + cytokine protein

Partial least squares (PLS)

Format data

X <- counts.mod.delta %>% 
  filter(donorID %in% asthma.delta$donorID) %>% 
  full_join(plex.delta) %>% 
  column_to_rownames("donorID")
## Joining, by = "donorID"
X <- X[complete.cases(X),]
 
Y <- asthma.delta %>% 
  filter(donorID %in% rownames(X)) %>% 
  column_to_rownames("donorID")

Perform PLS regression.

ncomp = 10
#PLS
pls <- pls(X, Y, ncomp = ncomp, mode = "regression")  

#SPLS
spls <- spls(X, Y, ncomp = ncomp, mode = "regression")  

Tuning

Number of components

Compute evaluation criteria for (S)PLS.

pls.Q2 <- perf(pls, validation = "Mfold", folds = 10, nrepeat = 100,
               progressBar = FALSE)

spls.Q2 <- perf(spls, validation = "Mfold", folds = 10, nrepeat = 100,
                progressBar = FALSE)

Q2 total is the sum of the quality of fit over all variables. In general, components should only continue to be added if Q2 < 0.0975 (red line). After that, there are diminishing returns to adding more components. Here, we would need to retain too many components by this general rule.

Thus, axis loadings will be assessed as well. Looking at the axis loadings (same for PLS and SPLS), component 1 explains the most variation (~45%) in X with further components contributing to a lesser extent (<10%). Y shows more even contributions from components 1-3 being > 20%.

Taken together, 2 X and 2 Y components will be used.

Number of variables

Compute mean absolute values (MAE) for variables on each component. (Only SPLS is assessed from this point).

#X variables
spls.MAE.X <- tune.spls(X, Y, 
                        ncomp = ncompX, #Determined above
                        test.keepX = c(2:30, 40),
                        validation = "Mfold", 
                        folds = 10, nrepeat = 100, 
                        progressBar = FALSE, 
                        measure = 'MAE')
#Y variables
spls.MAE.Y <- tune.spls(Y, X,
                        ncomp = ncompY, #Determined above
                        test.keepX = c(2:3),
                        validation = "Mfold", 
                        folds = 10, nrepeat = 10, 
                        progressBar = FALSE, 
                        measure = 'MAE')

#Save
save(spls.MAE.X, spls.MAE.Y,
     file="results/PLS/MAE2.RData")

Based on minimum MAE, the number of variables to keep per component is

Space Component No. to keep
X 1 6
2 40
Y 1 3
2 2

Run (S)PLS

Using the parameters determined above, run the final (S)PLS regression. 2 components will be used based on tuning X.

result.spls <- spls(X,Y, 
                    keepX = spls.MAE.X$choice.keepX,
                    keepY = spls.MAE.Y$choice.keepX,
                    ncomp = ncompX, mode = "regression")  

This results in the following variables across 2 components.

Correlations within SPLS

R session

sessionInfo()
## R version 4.0.2 (2020-06-22)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Catalina 10.15.6
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] kableExtra_1.3.1 knitr_1.30       mixOmics_6.12.2  lattice_0.20-41 
##  [5] MASS_7.3-53      corrplot_0.84    limma_3.44.3     tidytext_0.3.0  
##  [9] cowplot_1.1.1    readxl_1.3.1     forcats_0.5.0    stringr_1.4.0   
## [13] dplyr_1.0.3      purrr_0.3.4      readr_1.4.0      tidyr_1.1.2     
## [17] tibble_3.0.5     ggplot2_3.3.3    tidyverse_1.3.0 
## 
## loaded via a namespace (and not attached):
##  [1] httr_1.4.2         viridisLite_0.3.0  jsonlite_1.7.2     ellipse_0.4.2     
##  [5] modelr_0.1.8       assertthat_0.2.1   highr_0.8          selectr_0.4-2     
##  [9] cellranger_1.1.0   yaml_2.2.1         pillar_1.4.7       backports_1.2.1   
## [13] glue_1.4.2         digest_0.6.27      RColorBrewer_1.1-2 rvest_0.3.6       
## [17] colorspace_2.0-0   htmltools_0.5.1    Matrix_1.3-2       plyr_1.8.6        
## [21] pkgconfig_2.0.3    broom_0.7.3        haven_2.3.1        corpcor_1.6.9     
## [25] webshot_0.5.2      scales_1.1.1       RSpectra_0.16-0    farver_2.0.3      
## [29] generics_0.1.0     ellipsis_0.3.1     withr_2.4.0        cli_2.2.0         
## [33] magrittr_2.0.1     crayon_1.3.4       evaluate_0.14      tokenizers_0.2.1  
## [37] janeaustenr_0.1.5  fs_1.5.0           fansi_0.4.2        SnowballC_0.7.0   
## [41] xml2_1.3.2         tools_4.0.2        hms_1.0.0          lifecycle_0.2.0   
## [45] matrixStats_0.57.0 munsell_0.5.0      reprex_0.3.0       compiler_4.0.2    
## [49] rlang_0.4.10       grid_4.0.2         rstudioapi_0.13    igraph_1.2.6      
## [53] labeling_0.4.2     rmarkdown_2.6      gtable_0.3.0       DBI_1.1.1         
## [57] rARPACK_0.11-0     reshape2_1.4.4     R6_2.5.0           gridExtra_2.3     
## [61] lubridate_1.7.9.2  stringi_1.5.3      parallel_4.0.2     Rcpp_1.0.6        
## [65] vctrs_0.3.6        dbplyr_2.0.0       tidyselect_1.1.0   xfun_0.20